Additional data from mm2 and mm7 were processed and included in “HCR.plus.ribo” data object to properly model batch effect and to further correct it.

load data

load("../../rdas/mm4plus/HRC.plus.ribo.rda")
load("../../rdas/mm4plus/mm4.ribo.rda")
load("../../rdas/mm4plus//HCR.ortho.geneLength.rda")

compute RPKM

use species specific gene length

HCR.plus.ribo.rpkm <- prop.table(as.matrix(HCR.plus.ribo[,-1]),2)*10^9
HCR.plus.ribo.rpkm <- cbind(HCR.plus.ribo.rpkm[,1:5]/HCR.geneLength$Chimp,HCR.plus.ribo.rpkm[,6:11]/HCR.geneLength$Human,HCR.plus.ribo.rpkm[,12:16]/HCR.geneLength$Rhesus,HCR.plus.ribo.rpkm[,17:26]/HCR.geneLength$Human)


mm4.ribo.rpkm <- prop.table(as.matrix(mm4.ribo[,-1]),2)*10^9
mm4.ribo.rpkm<-cbind(mm4.ribo.rpkm[,1:4]/HCR.geneLength$Chimp, mm4.ribo.rpkm[,5:8]/HCR.geneLength$Rhesus, mm4.ribo.rpkm[,9:12]/HCR.geneLength$Human)

filter data by requiring at 2/3 (8) of mm4 data have rpkm greater than 0

rpkm.filter<-c() 
for (i in 1:length(mm4.ribo.rpkm[,1])){
rpkm.filter[i]<-sum(mm4.ribo.rpkm[i,] == 0) < 5
}

HCR.plus.ribo.rpkm<-HCR.plus.ribo.rpkm[rpkm.filter,]
mm4.ribo.rpkm<-mm4.ribo.rpkm[rpkm.filter,]

quantile normalize and log2 transform data

library(limma)
HCR.plus.ribo.rpkm.Q<-normalizeQuantiles(HCR.plus.ribo.rpkm,ties = T)
mm4.ribo.rpkm.Q<-normalizeQuantiles(mm4.ribo.rpkm,ties = T)

is.na(HCR.plus.ribo.rpkm.Q)<-which(HCR.plus.ribo.rpkm.Q == 0)
log2.HCR.plus.ribo.rpkm.Q<-log2(HCR.plus.ribo.rpkm.Q)

is.na(mm4.ribo.rpkm.Q)<-which(mm4.ribo.rpkm.Q == 0)
log2.mm4.ribo.rpkm.Q<-log2(mm4.ribo.rpkm.Q)

limma remove batch effect

spec<-substring(colnames(log2.HCR.plus.ribo.rpkm.Q),1,1)
spec[17:26]<-rep("H",10)
spec.design <- model.matrix(~spec)
library(gdata)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## 
## The following object is masked from 'package:stats':
## 
##     nobs
## 
## The following object is masked from 'package:utils':
## 
##     object.size
read.xls("/Users/siddisis/Dropbox/work/Ribosome_profiling/Primate_comparison/primate_ribo_gender_mmbatch.xlsx")->cell.source
cell.source<-cell.source[,1:4]
batch.additional<-read.xls("/Users/siddisis/Dropbox/work/Ribosome_profiling/Primate_comparison/mmbatch_info_additional_YRI_for_batch_correction.xlsx")

batch<-c(as.character(cell.source$libmix),c(as.character(batch.additional$libmix[4:7]),as.character(batch.additional$libmix[-4:-7])))

batch.removed.limma.full<-removeBatchEffect(log2.HCR.plus.ribo.rpkm.Q,batch = as.factor(batch),design = spec.design)

limma, weighted least square

var.weight<-matrix(nrow =length(log2.HCR.plus.ribo.rpkm.Q[,1]),ncol = 26 )

for (i in 1:length(log2.HCR.plus.ribo.rpkm.Q[,1])){
#chimp.var<-var(log2.HCR.plus.ribo.rpkm.Q[i,1:5],na.rm = T)
mm1.var<-var(log2.HCR.plus.ribo.rpkm.Q[i,6:9],na.rm = T)
mm2.var<-var(log2.HCR.plus.ribo.rpkm.Q[i,c(10,21,22,23)],na.rm = T)
mm7.var<-var(log2.HCR.plus.ribo.rpkm.Q[i,c(11,24,25,26)],na.rm = T)
#rhesus.var<-var(log2.HCR.plus.ribo.rpkm.Q[i,12:16],na.rm = T)
mm4.var<-var(log2.HCR.plus.ribo.rpkm.Q[i,c(1:3,5,13:20)],na.rm = T)
mm8.var<-var(log2.HCR.plus.ribo.rpkm.Q[i,c(4,12)],na.rm = T)
var.weight[i,]<-c(rep(1/mm4.var,3),1/mm8.var,1/mm4.var,rep(1/mm1.var,4),1/mm2.var,1/mm7.var,1/mm8.var,rep(1/mm4.var,8),rep(1/mm2.var,3),rep(1/mm7.var,3))
}

batch.removed.limma.weighted<-removeBatchEffect(log2.HCR.plus.ribo.rpkm.Q,batch = as.factor(batch),design = spec.design,weight=var.weight)
## Warning: Partial NA coefficients for 9 probe(s)

Combat

library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 3.2.2
## This is mgcv 1.8-7. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## Creating a generic function for 'nchar' from package 'base' in package 'S4Vectors'
## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "externalRefMethod" of class "expressionORfunction";
## definition not updated
## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "externalRefMethod" of class "functionORNULL";
## definition not updated
## 
## Attaching package: 'genefilter'
## 
## The following object is masked from 'package:base':
## 
##     anyNA
batch.removed.combat.par<-ComBat(dat = log2.HCR.plus.ribo.rpkm.Q,mod = model.matrix(~ spec),batch = as.factor(batch),par.prior = TRUE,prior.plots = FALSE)
## Found 5 batches
## Adjusting for 2 covariate(s) or covariate level(s)
## Found 580 Missing Data Values
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
batch.removed.combat.nonpar<-ComBat(dat = log2.HCR.plus.ribo.rpkm.Q,mod = model.matrix(~ spec),batch = as.factor(batch),par.prior = FALSE,prior.plots = FALSE)
## Found 5 batches
## Adjusting for 2 covariate(s) or covariate level(s)
## Found 580 Missing Data Values
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding nonparametric adjustments
## Adjusting the Data

study design

table(batch,spec)
##      spec
## batch C H R
##   mm1 0 4 0
##   mm2 0 4 0
##   mm4 4 4 4
##   mm7 0 4 0
##   mm8 1 0 1

pca

library(ggplot2)
library(reshape2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.2
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:gdata':
## 
##     combine, first, last
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gdata)

# full data
pc<-prcomp(t(na.omit(log2.HCR.plus.ribo.rpkm.Q)),center = T,scale = T)
barplot((pc$sdev)^2/sum((pc$sdev)^2), names.arg =  colnames(pc$x))

temp.data<-as.data.frame(cbind(melt(pc$x),pc$x[,1],spec,batch))
## Warning in data.frame(..., check.names = FALSE): row names were found from
## a short variable and have been discarded
names(temp.data)<-c("ID","PC","value","pc1","species","batch")

temp.data %>% filter(as.numeric(temp.data$PC) < 10) %>% ggplot(aes(x=pc1,y=value))+geom_point(aes(color = species, shape = batch))+facet_wrap(~PC)+labs(y="pcX")

temp.data %>% filter(as.numeric(temp.data$PC) < 10) %>% ggplot(aes(x=pc1,y=value))+geom_point(aes(shape = species))+facet_wrap(~PC)+labs(y="pcX")

temp.data %>% filter(as.numeric(temp.data$PC) < 10) %>% ggplot(aes(x=pc1,y=value))+geom_point(aes(shape = batch))+facet_wrap(~PC)+labs(y="pcX")

# mm4
pc<-prcomp(t(na.omit(log2.mm4.ribo.rpkm.Q)),center = T,scale = T)
barplot((pc$sdev)^2/sum((pc$sdev)^2), names.arg =  colnames(pc$x))

mm4.spec<-substring(rownames(pc$x),1,1)
mm4.spec<-c(mm4.spec[1:8],rep("H",4))
temp.data<-as.data.frame(cbind(melt(pc$x),pc$x[,1],mm4.spec))
## Warning in data.frame(..., check.names = FALSE): row names were found from
## a short variable and have been discarded
names(temp.data)<-c("ID","PC","value","pc1","species")

temp.data %>% filter(as.numeric(temp.data$PC) < 10) %>% ggplot(aes(x=pc1,y=value))+geom_point(aes(shape = species))+facet_wrap(~PC)+labs(y="pcX")

# combat nonparametric corrected all 26
pc<-prcomp(t(na.omit(batch.removed.combat.nonpar)),center = T,scale = T)
barplot((pc$sdev)^2/sum((pc$sdev)^2), names.arg =  colnames(pc$x))

temp.data<-as.data.frame(cbind(melt(pc$x),pc$x[,1],spec,batch))
## Warning in data.frame(..., check.names = FALSE): row names were found from
## a short variable and have been discarded
names(temp.data)<-c("ID","PC","value","pc1","species","batch")

temp.data %>% filter(as.numeric(temp.data$PC) < 10) %>% ggplot(aes(x=pc1,y=value))+geom_point(aes(color = species, shape=batch))+facet_wrap(~PC)

# combat nonparametric corrected, 16 used 
pc<-prcomp(t(na.omit(batch.removed.combat.nonpar[,1:16])),center = T,scale = T)
barplot((pc$sdev)^2/sum((pc$sdev)^2), names.arg =  colnames(pc$x))

temp.data<-as.data.frame(cbind(melt(pc$x),pc$x[,1],spec[1:16],batch[1:16]))
## Warning in data.frame(..., check.names = FALSE): row names were found from
## a short variable and have been discarded
names(temp.data)<-c("ID","PC","value","pc1","species","batch")

temp.data %>% filter(as.numeric(temp.data$PC) < 10) %>% ggplot(aes(x=pc1,y=value))+geom_point(aes(color = species, shape=batch))+facet_wrap(~PC)+labs(y="PCX")

temp.data %>% filter(as.numeric(temp.data$PC) < 10) %>% ggplot(aes(x=pc1,y=value))+geom_point(aes(shape = species))+facet_wrap(~PC)+labs(y="pcX")

temp.data %>% filter(as.numeric(temp.data$PC) < 10) %>% ggplot(aes(x=pc1,y=value))+geom_point(aes(shape = batch))+facet_wrap(~PC)+labs(y="pcX")

## pc1 vs pc2

pc.mm4<-prcomp(t(na.omit(log2.mm4.ribo.rpkm.Q)),center = T,scale = T)
pc1.mm4<-pc.mm4$x[,1]
pc2.mm4<-pc.mm4$x[,2]

mm4.pc.data<-as.data.frame(cbind(pc1.mm4,pc2.mm4))
mm4.pc.data<-as.data.frame(cbind(rownames(mm4.pc.data),rep("mm4",12),mm4.pc.data,c(rep("C",4),rep("R",4),rep("H",4)),rep("mm4",12)),stringsAsFactors=F)
names(mm4.pc.data)<-c("ID","data","PC1","PC2","species","batch")

pc.ori<-prcomp(t(na.omit(log2.HCR.plus.ribo.rpkm.Q[,1:16])),center = T,scale = T)
pc1.ori<-pc.ori$x[,1]
pc2.ori<-pc.ori$x[,2]

pc.limma<-prcomp(t(na.omit(batch.removed.limma.full[,1:16])),center = T,scale = T)
pc1.limma<-pc.limma$x[,1]
pc2.limma<-pc.limma$x[,2]

pc.limma.weighted<-prcomp(t(na.omit(batch.removed.limma.weighted[,1:16])),center = T,scale = T)
pc1.limma.wt<-pc.limma.weighted$x[,1]
pc2.limma.wt<-pc.limma.weighted$x[,2]

pc.combat.par<-prcomp(t(na.omit(batch.removed.combat.par[,1:16])),center = T,scale = T)
pc1.combat.par<-pc.combat.par$x[,1]
pc2.combat.par<-pc.combat.par$x[,2]

pc.combat.nonpar<-prcomp(t(na.omit(batch.removed.combat.nonpar[,1:16])),center = T,scale = T)
pc1.combat.nonpar<-pc.combat.nonpar$x[,1]
pc2.combat.nonpar<-pc.combat.nonpar$x[,2]

pc1.data<-cbind(pc1.ori,pc1.limma,pc1.limma.wt,pc1.combat.par,pc1.combat.nonpar)
colnames(pc1.data)<-c("original","limma","limma.wt","combat.par","combat.nonpar")
pc1.data<-melt(pc1.data)
pc2.data<-melt(cbind(pc2.ori,pc2.limma,pc2.limma.wt,pc2.combat.par,pc2.combat.nonpar))

pc.data<-cbind(pc1.data,pc2.data$value,spec[1:16],batch[1:16])
names(pc.data)<-c("ID","data","PC1","PC2","species","batch")

#pc.data %>% ggplot(aes(x=PC1,y=PC2))+geom_point(aes(color = species, shape=batch))+facet_wrap(~data)+theme(panel.grid.minor=element_blank())

rbind(mm4.pc.data,pc.data)%>% ggplot(aes(x=PC1,y=PC2))+geom_point(aes(color = species, shape=batch))+facet_wrap(~data)+theme(panel.grid.minor=element_blank())

rbind(mm4.pc.data,pc.data)%>% ggplot(aes(x=PC1,y=PC2))+geom_point(aes( shape=species))+facet_wrap(~data)+theme(panel.grid.minor=element_blank())

rbind(mm4.pc.data,pc.data)%>% ggplot(aes(x=PC1,y=PC2))+geom_point(aes( shape=batch))+facet_wrap(~data)+theme(panel.grid.minor=element_blank())

DE test beta without batch correction

log2.HCR.ribo.rpkm.Q<-log2.HCR.plus.ribo.rpkm.Q[,1:16]

species.label<-as.factor(substring(colnames(log2.HCR.ribo.rpkm.Q),1,1))

ori.lm.fit<-lmFit(log2.HCR.ribo.rpkm.Q,design = model.matrix(~ species.label))
oriHvC.effect<-ori.lm.fit$coefficient[,2]
oriRvC.effect<-ori.lm.fit$coefficient[,3]

ori.de.test<-eBayes(ori.lm.fit)

oriHvC.pval<-ori.de.test$p.value[,2]
oriRvC.pval<-ori.de.test$p.value[,3]

DE test, beta for mm4

species.label<-as.factor(substring(colnames(log2.mm4.ribo.rpkm.Q),1,1))

mm4.lm.fit<-lmFit(log2.mm4.ribo.rpkm.Q,design = model.matrix(~ species.label))
mm4HvC.effect<-mm4.lm.fit$coefficient[,3]
mm4RvC.effect<-mm4.lm.fit$coefficient[,2]

mm4.de.test<-eBayes(mm4.lm.fit)

mm4HvC.pval<-mm4.de.test$p.value[,3]
mm4RvC.pval<-mm4.de.test$p.value[,2]

DE test beta for batch corrected set

species.label<-as.factor(substring(colnames(log2.HCR.ribo.rpkm.Q),1,1))

# limma

HCR.ribo.batch.corrected<-batch.removed.limma.full[,1:16]

limma.lm.fit<-lmFit(HCR.ribo.batch.corrected,design = model.matrix(~ species.label))
limmaHvC.effect<-limma.lm.fit$coefficient[,2]
limmaRvC.effect<-limma.lm.fit$coefficient[,3]

limma.de.test<-eBayes(limma.lm.fit)

limmaHvC.pval<-limma.de.test$p.value[,2]
limmaRvC.pval<-limma.de.test$p.value[,3]

# weighted limma

HCR.ribo.batch.corrected<-batch.removed.limma.weighted[,1:16]
limma.wt.lm.fit<-lmFit(HCR.ribo.batch.corrected,design = model.matrix(~ species.label))
limma.wtHvC.effect<-limma.wt.lm.fit$coefficient[,2]
limma.wtRvC.effect<-limma.wt.lm.fit$coefficient[,3]

limma.wt.de.test<-eBayes(limma.wt.lm.fit)

limma.wtHvC.pval<-limma.wt.de.test$p.value[,2]
limma.wtRvC.pval<-limma.wt.de.test$p.value[,3]

# combat parametric

HCR.ribo.batch.corrected<-batch.removed.combat.par[,1:16]

combat.par.lm.fit<-lmFit(HCR.ribo.batch.corrected,design = model.matrix(~ species.label))
combat.parHvC.effect<-combat.par.lm.fit$coefficient[,2]
combat.parRvC.effect<-combat.par.lm.fit$coefficient[,3]

combat.par.de.test<-eBayes(combat.par.lm.fit)

combat.parHvC.pval<-combat.par.de.test$p.value[,2]
combat.parRvC.pval<-combat.par.de.test$p.value[,3]

# combat non-parametric

HCR.ribo.batch.corrected<-batch.removed.combat.nonpar[,1:16]

combat.nonpar.lm.fit<-lmFit(HCR.ribo.batch.corrected,design = model.matrix(~ species.label))
combat.nonparHvC.effect<-combat.nonpar.lm.fit$coefficient[,2]
combat.nonparRvC.effect<-combat.nonpar.lm.fit$coefficient[,3]

combat.nonpar.de.test<-eBayes(combat.nonpar.lm.fit)

combat.nonparHvC.pval<-combat.nonpar.de.test$p.value[,2]
combat.nonparRvC.pval<-combat.nonpar.de.test$p.value[,3]

Spearman rank correlation coefficient of test statistics

HvsC.pval<-c(
cor(mm4HvC.pval,oriHvC.pval, method = "spearman"),
cor(mm4HvC.pval,limmaHvC.pval, method = "spearman"),
cor(mm4HvC.pval,limma.wtHvC.pval, method = "spearman"),
cor(mm4HvC.pval,combat.parHvC.pval, method = "spearman"),
cor(mm4HvC.pval,combat.nonparHvC.pval, method = "spearman")
)
RvsC.pval<-c(
cor(mm4RvC.pval,oriRvC.pval, method = "spearman"),
cor(mm4RvC.pval,limmaRvC.pval, method = "spearman"),
cor(mm4RvC.pval,limma.wtRvC.pval, method = "spearman"),
cor(mm4RvC.pval,combat.parRvC.pval, method = "spearman"),
cor(mm4RvC.pval,combat.nonparRvC.pval, method = "spearman")
)

pval.cor<-as.data.frame(rbind(HvsC.pval,RvsC.pval))
names(pval.cor)<-c("original","limma","limma.wt","combat.par","combat.nonpar")

# correlation of DE test p values between mm4 and the data specified in column name 
pval.cor
##            original     limma  limma.wt combat.par combat.nonpar
## HvsC.pval 0.6120482 0.9207119 0.9207645  0.9053674     0.9122452
## RvsC.pval 0.9110777 0.9159135 0.9157792  0.9287111     0.9392383
HvsC.beta<-c(
cor(mm4HvC.effect,oriHvC.effect, method = "spearman"),
cor(mm4HvC.effect,limmaHvC.effect, method = "spearman"),
cor(mm4HvC.effect,limma.wtHvC.effect, method = "spearman"),
cor(mm4HvC.effect,combat.parHvC.effect, method = "spearman"),
cor(mm4HvC.effect,combat.nonparHvC.effect, method = "spearman")
)
RvsC.beta<-c(
cor(mm4RvC.effect,oriRvC.effect, method = "spearman"),
cor(mm4RvC.effect,limmaRvC.effect, method = "spearman"),
cor(mm4RvC.effect,limma.wtRvC.effect, method = "spearman"),
cor(mm4RvC.effect,combat.parRvC.effect, method = "spearman"),
cor(mm4RvC.effect,combat.nonparRvC.effect, method = "spearman")
)

beta.cor<-as.data.frame(rbind(HvsC.beta,RvsC.beta))
names(beta.cor)<-c("original","limma","limma.wt","combat.par","combat.nonpar")

# correlation of DE test beta values between mm4 and the data specified in column name 
beta.cor
##            original     limma  limma.wt combat.par combat.nonpar
## HvsC.beta 0.8140696 0.9777403 0.9777286  0.9697329     0.9739848
## RvsC.beta 0.9768566 0.9768621 0.9768621  0.9803080     0.9848971

scatter plot to visualize effect of batch correction

#human vs chimp
# beta
ggplot(mapping = aes(x=oriHvC.effect,y=mm4HvC.effect))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("original")+ylab("mm4")+ggtitle("beta HC")

ggplot(mapping = aes(x=limmaHvC.effect,y=mm4HvC.effect))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("limma")+ylab("mm4")+ggtitle("beta HC")

ggplot(mapping = aes(x=limma.wtHvC.effect,y=mm4HvC.effect))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("limma.wt")+ylab("mm4")+ggtitle("beta HC")

ggplot(mapping = aes(x=combat.parHvC.effect,y=mm4HvC.effect))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("combat.par")+ylab("mm4")+ggtitle("beta HC")

ggplot(mapping = aes(x=combat.nonparHvC.effect,y=mm4HvC.effect))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("combat.nonpar")+ylab("mm4")+ggtitle("beta HC")

# pvalue
ggplot(mapping = aes(x=-log10(oriHvC.pval),y=-log10(mm4HvC.pval)))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("original")+ylab("mm4")+ggtitle("-log10 pvalue HC")

ggplot(mapping = aes(x=-log10(limmaHvC.pval),y=-log10(mm4HvC.pval)))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("limma")+ylab("mm4")+ggtitle("-log10 pvalue HC")

ggplot(mapping = aes(x=-log10(limma.wtHvC.pval),y=-log10(mm4HvC.pval)))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("limma.wt")+ylab("mm4")+ggtitle("-log10 pvalue HC")

ggplot(mapping = aes(x=-log10(combat.parHvC.pval),y=-log10(mm4HvC.pval)))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("combat.par")+ylab("mm4")+ggtitle("-log10 pvalue HC")

ggplot(mapping = aes(x=-log10(combat.nonparHvC.pval),y=-log10(mm4HvC.pval)))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("combat.nonpar")+ylab("mm4")+ggtitle("-log10 pvalue HC")

#Rhesus vs Chimp

# beta
ggplot(mapping = aes(x=oriRvC.effect,y=mm4RvC.effect))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("original")+ylab("mm4")+ggtitle("beta RC")

ggplot(mapping = aes(x=limmaRvC.effect,y=mm4RvC.effect))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("limma")+ylab("mm4")+ggtitle("beta RC")

ggplot(mapping = aes(x=limma.wtRvC.effect,y=mm4RvC.effect))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("limma.wt")+ylab("mm4")+ggtitle("beta RC")

ggplot(mapping = aes(x=combat.parRvC.effect,y=mm4RvC.effect))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("combat.par")+ylab("mm4")+ggtitle("beta RC")

ggplot(mapping = aes(x=combat.nonparRvC.effect,y=mm4RvC.effect))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("combat.nonpar")+ylab("mm4")+ggtitle("beta RC")

# pvalue
ggplot(mapping = aes(x=-log10(oriRvC.pval),y=-log10(mm4RvC.pval)))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("original")+ylab("mm4")+ggtitle("-log10 pvalue RC")

ggplot(mapping = aes(x=-log10(limmaRvC.pval),y=-log10(mm4RvC.pval)))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("limma")+ylab("mm4")+ggtitle("-log10 pvalue RC")

ggplot(mapping = aes(x=-log10(limma.wtRvC.pval),y=-log10(mm4RvC.pval)))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("limma.wt")+ylab("mm4")+ggtitle("-log10 pvalue RC")

ggplot(mapping = aes(x=-log10(combat.parRvC.pval),y=-log10(mm4RvC.pval)))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("combat.par")+ylab("mm4")+ggtitle("-log10 pvalue RC")

ggplot(mapping = aes(x=-log10(combat.nonparRvC.pval),y=-log10(mm4RvC.pval)))+geom_point(size=1, alpha=0.3)+geom_abline()+xlab("combat.nonpar")+ylab("mm4")+ggtitle("-log10 pvalue RC")

function for roc curve

roc.curve<-function(pvalue1,pvalue2,cutoff,breaks,color){
  TPR<-c()
  FPR<-c()
  truePositive <- pvalue1 < cutoff
  trueNegative <- pvalue1 > cutoff
  threshold<-seq(0,1,1/breaks)
  
  for (i in 1:breaks+1){
    Detected <- pvalue2 < threshold[i]
    TPR[i] <- length(which(truePositive & Detected))/length(which(truePositive))
    FPR[i] <- length(which(trueNegative & Detected))/length(which(trueNegative))
    
  }
  
  plot(FPR,TPR,xlim=c(0,1),ylim=c(0,1),col=color,pch=16,cex=0.4)
  
}

comparing between methods

roc.curve(mm4HvC.pval,oriHvC.pval,0.01,1000,"black")
par(new=TRUE)
roc.curve(mm4HvC.pval,limmaHvC.pval,0.01,1000,"green")
par(new=TRUE)
roc.curve(mm4HvC.pval,limma.wtHvC.pval,0.01,1000,"yellow")
par(new=TRUE)
roc.curve(mm4HvC.pval,combat.parHvC.pval,0.01,1000,"blue")
par(new=TRUE)
roc.curve(mm4HvC.pval,combat.nonparHvC.pval,0.01,1000,"red")
legend("bottomright",c("original","limma","limma.wt","combat.par","combat.nonpar"),col=c("black","green","yellow","blue","red"),pch = 16 ,cex = 0.75, pt.cex = 1,box.lwd = 0, ncol = 2)

roc.curve(mm4RvC.pval,oriRvC.pval,0.01,1000,"black")
par(new=TRUE)
roc.curve(mm4RvC.pval,limmaRvC.pval,0.01,1000,"green")
par(new=TRUE)
roc.curve(mm4RvC.pval,limma.wtRvC.pval,0.01,1000,"yellow")
par(new=TRUE)
roc.curve(mm4RvC.pval,combat.parRvC.pval,0.01,1000,"blue")
par(new=TRUE)
roc.curve(mm4RvC.pval,combat.nonparRvC.pval,0.01,1000,"red")
legend("bottomright",c("ori","lm","lmwt","combat.p","combat.np"),col=c("black","green","yellow","blue","red"), pch = 16 ,cex = 0.75, pt.cex = 1,box.lwd = 0, ncol = 2)

Venn diagram ori, mm4, corrected (combat, nonparametric)

mm4HvC.top500<-mm4HvC.pval<mm4HvC.pval[order(mm4HvC.pval)[501]]
oriHvC.top500<-oriHvC.pval<oriHvC.pval[order(oriHvC.pval)[501]]
combat.nonparHvC.top500<-combat.nonparHvC.pval<combat.nonparHvC.pval[order(combat.nonparHvC.pval)[501]]
HC.pval.top500<-cbind(mm4HvC.top500,oriHvC.top500,combat.nonparHvC.top500)
colnames(HC.pval.top500)<-c("mm4HC","oriHC","corHC")
vennDiagram(HC.pval.top500)

mm4RvC.top500<-mm4RvC.pval<mm4RvC.pval[order(mm4RvC.pval)[501]]
oriRvC.top500<-oriRvC.pval<oriRvC.pval[order(oriRvC.pval)[501]]
combat.nonparRvC.top500<-combat.nonparRvC.pval<combat.nonparRvC.pval[order(combat.nonparRvC.pval)[501]]
RC.pval.top500<-cbind(mm4RvC.top500,oriRvC.top500,combat.nonparRvC.top500)
colnames(RC.pval.top500)<-c("mm4RC","oriRC","corRC")
vennDiagram(RC.pval.top500)

mm4HvC.top1000<-mm4HvC.pval<mm4HvC.pval[order(mm4HvC.pval)[1001]]
oriHvC.top1000<-oriHvC.pval<oriHvC.pval[order(oriHvC.pval)[1001]]
combat.nonparHvC.top1000<-combat.nonparHvC.pval<combat.nonparHvC.pval[order(combat.nonparHvC.pval)[1001]]
HC.pval.top1000<-cbind(mm4HvC.top1000,oriHvC.top1000,combat.nonparHvC.top1000)
colnames(HC.pval.top1000)<-c("mm4","original","corrected")
vennDiagram(HC.pval.top1000)

mm4RvC.top1000<-mm4RvC.pval<mm4RvC.pval[order(mm4RvC.pval)[1001]]
oriRvC.top1000<-oriRvC.pval<oriRvC.pval[order(oriRvC.pval)[1001]]
combat.nonparRvC.top1000<-combat.nonparRvC.pval<combat.nonparRvC.pval[order(combat.nonparRvC.pval)[1001]]
RC.pval.top1000<-cbind(mm4RvC.top1000,oriRvC.top1000,combat.nonparRvC.top1000)
colnames(RC.pval.top1000)<-c("mm4RC","oriRC","corRC")
vennDiagram(RC.pval.top1000)